/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.fetcher;
import net.nutch.net.protocols.Response;
import net.nutch.io.ArrayFile;
import net.nutch.fetcher.HostQueue.HostQueueKey;
import net.nutch.pagedb.FetchListEntry;
import net.nutch.net.protocols.http.Http;
import net.nutch.net.protocols.http.MiscHttpAccounting;
import net.nutch.net.protocols.ftp.Ftp;
import net.nutch.util.FibonacciHeap;
import net.nutch.util.NutchConf;
import net.nutch.util.TrieStringMatcher;
import net.nutch.util.SoftHashMap;
import net.nutch.util.StringUtil;
import net.nutch.util.SuffixStringMatcher;
import net.nutch.util.LogFormatter;
import java.io.File;
import java.io.LineNumberReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.LinkedHashSet;
import java.util.StringTokenizer;
import java.util.logging.Logger;
import java.util.logging.Level;
import java.util.logging.Handler;
/**
* This class is responsible for reading from the
* <code>fetchList</code> DB, and coordinating the activity of {@link
* FetcherThread}s and {@link OutputThread}s.
*
* <p>
*
* A <code>RequestScheduler</code> reads records from the
* <code>fetchList</code>, and parcels them out to {@link HostQueue}s.
* <code>HostQueues</code> are polled for waiting requests when
* <code>FetcherThread</code>s are idle, and later notified when
* a request is completed.
*
* <p>
*
* Completed requests are queued for output.
* <code>OutputThread</code>s poll the <code>RequestScheduler</code>
* for finished requests to process.
*/
public class RequestScheduler implements FetcherConstants {
// Logger for this class.
public static final Logger LOG=
LogFormatter.getLogger("net.nutch.fetcher.RequestScheduler");
// Platform line separator, read from the "line.separator" system property.
public static final String NEWLINE_STRING=
System.getProperty("line.separator");
// Timeout, in milliseconds, passed to the outputQueue.wait() calls in
// this class.
public static final int WAIT_TIMEOUT= 15 * 1000;
// Configuration parameters- these all default to extremely
// conservative and/or ridiculous values, and should be overridden
// in the configuration file.
// Seconds between requests to the same host; the constructor converts
// this to milliseconds (msDelay).
public static final int DELAY_SECONDS=
NutchConf.getInt("fetcher.server.delay", 60);
// Number of FetcherThreads created by run().
public static final int NUM_FETCHER_THREADS=
NutchConf.getInt("fetcher.threads.fetch", 5);
// Number of OutputThreads created by run().
public static final int NUM_OUTPUT_THREADS=
NutchConf.getInt("fetcher.threads.output", 5);
// primeQueue() stops reading the fetchList once this many requests are
// queued in HostQueues.
public static final int MAX_QUEUED_REQUESTS=
NutchConf.getInt("fetcher.request.queue", 2000);
// Copied into the maxOutputQueue field by the constructor.
public static final int MAX_OUTPUT_QUEUE=
NutchConf.getInt("fetcher.output.queue", 20);
// In this class, only used to derive MAX_QUEUED_HOSTS.
public static final int MAX_ACTIVE_HOSTS=
NutchConf.getInt("fetcher.active.servers", 400);
// checkQueues() moves idle HostQueues into the soft cache once more than
// this many are idle.
public static final int MAX_CACHED_ROBOTS=
NutchConf.getInt("fetcher.robots.cache", 200);
// Minutes between statistics intervals scheduled by run().
public static final int STATS_MINUTES=
NutchConf.getInt("fetcher.stats.minutes", 2);
// primeQueue() stops reading the fetchList once allHostQueues holds this
// many HostQueues.
public static final int MAX_QUEUED_HOSTS=
MAX_CACHED_ROBOTS + MAX_ACTIVE_HOSTS;
// queueNewRequest() drops ordinary requests for a HostQueue that already
// holds this many.
public static final int MAX_HOSTQUEUE_LENGTH=
NutchConf.getInt("fetcher.server.maxurls", 1000);
// When fewer than this many HostQueues are ready/busy/in delay,
// getNextRequestHelper() trims the queue it is about to serve...
public static final int LOW_ACTIVE_QUEUES=
NutchConf.getInt("fetcher.lowservers.threshold", 10);
// ...down to this many requests.
public static final int LOW_ACTIVE_QUEUES_MAX_LENGTH=
NutchConf.getInt("fetcher.lowservers.maxurls", 100);
// Number of failed fetch attempts after which a request is failed with
// FAIL_TOO_MANY_ERRORS.
public static final int MAX_PAGE_ERRORS=
NutchConf.getInt("fetcher.retry.max", 3);
// Redirect count at which a request is failed with
// FAIL_TOO_MANY_REDIRECTS.
public static final int MAX_PAGE_REDIRECTS=
NutchConf.getInt("fetcher.redirect.max", 3);
// The agent name we advertise; the constructor ensures it is the first
// entry handed to the RobotRulesParser.
private static final String AGENT_NAME=
NutchConf.get("http.agent.name");
// Throttling settings; run() only throttles when both the period and
// the bandwidth are greater than zero, so the -1 defaults disable it.
private static final int THROTTLE_PERIOD_SECONDS=
NutchConf.getInt("fetcher.throttle.period", -1);
private static final int THROTTLE_MAX_BANDWIDTH=
NutchConf.getInt("fetcher.throttle.bandwidth", -1);
// Only applied when a non-negative throttle bandwidth is configured (see
// the constructor).
private static final int THROTTLE_INITIAL_THREADS=
NutchConf.getInt("fetcher.throttle.initial.threads", 1);
// Setting this above one is dangerous- you are likely to draw the
// ire of many webmasters. You should only adjust this if you
// really know what you're doing and have permission from the sites
// you'll be hitting.
private static final int MAX_CONCURRENT_REQUESTS_TO_A_SINGLE_SERVER= 1;
// Multiplier used to convert second-based settings to milliseconds.
public static final long SECONDS_TO_MS_MULTIPLIER= 1000;
// controls behavior (instance copies of the configuration constants,
// assigned in the constructor)
private long msDelay;
private int maxPageErrors;
private int maxPageRedirects;
private int numFetchThreads;
private int numOutputThreads;
private int maxOutputQueue;
private int maxQueuedRequests;
private int maxQueuedHosts;
private int maxCachedRobots;
private long throttlePeriod;
private int throttleMaxBandwidth;
private int throttleInitialThreads;
// maps HostQueueKey to HostQueue for every live HostQueue
private HashMap allHostQueues; // contains all HostQueues
// each HostQueue is also in exactly one of the following structures:
// hosts with max # of requests ongoing
private HashSet busyHostQueues;
// the time when we last ran checkQueues
private long lastCheckQueues;
// current time, reset on all calls to returnRequestAndGetNext
private long now;
// hosts with "< max #" of requests ongoing / in delay; keyed by
// -queue.size(), so the longest queue is popped first
private FibonacciHeap readyHostQueues;
// hosts with "< max #" of requests ongoing, remainder in delay
private LinkedList delayHostQueues;
// hosts with no URLs left (w/cached robots.txt, dead-status, etc)
private LinkedHashSet idleHostQueues;
// cache of hosts which have fallen out of idle queue
private SoftHashMap hostQueueCache;
// requests which are ready for OutputThreads, but have not yet
// reached the outputQueue
private LinkedList pendingOutputQueue;
// requests which are ready for OutputThreads.
// NOTE: All accesses to this object must be synchronized on it.
// If a thread needs to synchronize on both "this" and the outputQueue
// object, "this" must be locked first, then "outputQueue".
private LinkedList outputQueue;
// number of fetchList requests held by HostQueues
private int numQueuedRequests;
// number of fetchList requests held by FetcherThreads
private int numOutstandingRequests;
// The input and output DBs
private ArrayFile.Reader fetchList;
private ArrayFile.Writer fetcherDb;
private ArrayFile.Writer rawDb;
private ArrayFile.Writer strippedDb;
// have we exhausted the fetchList?
private boolean fetchListEmpty;
// have we finished processing all requests from the fetchlist?
private boolean finishedRequests;
// Robots rules parser for our HostQueues to use
private RobotRulesParser robotRulesParser;
// matchers for banned host names; null when no ban list could be loaded
private TrieStringMatcher hostNameBans[];
// accumulated statistics for the whole run
private FetcherStatus overallFetcherStatus;
// the User-Agent string assembled in the constructor
private String agentString;
// set once a SEVERE message has been logged (see finishedOutput() and
// finishedRequests())
private boolean aborted;
/**
 * Creates a scheduler that reads requests from <code>fetchList</code>
 * and keeps the three output DBs for the {@link OutputThread}s that
 * {@link #run()} starts.
 *
 * <p>
 *
 * Besides copying the configuration constants into instance fields,
 * the constructor builds the {@link RobotRulesParser} from the
 * <code>http.robots.agents</code> property, assembles the User-Agent
 * string from the <code>http.agent.*</code> properties, and loads the
 * host-name suffix ban list named by
 * <code>excludehosts.suffix.file</code>.
 *
 * @param fetchList  source of the entries to fetch
 * @param fetcherDb  output DB handed to the output threads
 * @param rawDb      output DB handed to the output threads
 * @param strippedDb output DB handed to the output threads
 */
public RequestScheduler(ArrayFile.Reader fetchList,
                        ArrayFile.Writer fetcherDb,
                        ArrayFile.Writer rawDb,
                        ArrayFile.Writer strippedDb) {
  this.fetchList= fetchList;
  this.fetcherDb= fetcherDb;
  this.rawDb= rawDb;
  this.strippedDb= strippedDb;

  this.msDelay= DELAY_SECONDS * SECONDS_TO_MS_MULTIPLIER;
  this.numFetchThreads= NUM_FETCHER_THREADS;
  this.numOutputThreads= NUM_OUTPUT_THREADS;
  this.maxQueuedRequests= MAX_QUEUED_REQUESTS;
  this.maxOutputQueue= MAX_OUTPUT_QUEUE;
  this.maxQueuedHosts= MAX_QUEUED_HOSTS;
  this.maxCachedRobots= MAX_CACHED_ROBOTS;
  this.maxPageErrors= MAX_PAGE_ERRORS;
  this.maxPageRedirects= MAX_PAGE_REDIRECTS;
  this.throttlePeriod= THROTTLE_PERIOD_SECONDS;
  this.throttleMaxBandwidth= THROTTLE_MAX_BANDWIDTH;
  // without a bandwidth cap every fetcher thread starts unthrottled
  if (throttleMaxBandwidth >= 0)
    this.throttleInitialThreads= THROTTLE_INITIAL_THREADS;
  else
    this.throttleInitialThreads= numFetchThreads;

  this.overallFetcherStatus= new FetcherStatus();
  this.aborted= false;
  numQueuedRequests= 0;
  numOutstandingRequests= 0;
  fetchListEmpty= false;
  finishedRequests= false;
  lastCheckQueues= 0;

  allHostQueues= new HashMap();
  busyHostQueues= new HashSet();
  readyHostQueues= new FibonacciHeap();
  delayHostQueues= new LinkedList();
  idleHostQueues= new LinkedHashSet();
  hostQueueCache= new SoftHashMap();
  pendingOutputQueue= new LinkedList();
  outputQueue= new LinkedList();

  FetcherStatus.logKeys();

  // build robotRulesParser
  String allAgentNames= NutchConf.get("http.robots.agents");
  ArrayList agents= new ArrayList();
  // an unset property is treated like an empty list (reported below)
  // instead of failing with a NullPointerException in StringTokenizer
  if (allAgentNames != null) {
    StringTokenizer tok= new StringTokenizer(allAgentNames, ",");
    while (tok.hasMoreTokens()) {
      agents.add(tok.nextToken().trim());
    }
  }
  if (agents.size() == 0) {
    agents.add(AGENT_NAME);
    LOG.severe("No agents listed in 'http.robots.agents' property!");
  } else if (!((String)agents.get(0)).equalsIgnoreCase(AGENT_NAME)) {
    agents.add(0, AGENT_NAME);
    LOG.severe("Agent we advertise (" + AGENT_NAME
               + ") not listed first in 'http.robots.agents' property!");
  }
  String[] agentStrings= (String[])
    agents.toArray(new String[agents.size()]);
  robotRulesParser= new RobotRulesParser(agentStrings);
  FetcherStatus.logTraceMisc(MISC_INFORMATIONAL,
                             "Robots.txt entries we'll obey (in order):");
  for (int i= 0; i < agentStrings.length; i++)
    FetcherStatus.logTraceMisc(MISC_INFORMATIONAL, agentStrings[i]);

  // build agent string: name[/version][ (description; url; email)]
  String agentName = NutchConf.get("http.agent.name");
  String agentVersion = NutchConf.get("http.agent.version");
  String agentDesc = NutchConf.get("http.agent.description");
  String agentURL = NutchConf.get("http.agent.url");
  String agentEmail = NutchConf.get("http.agent.email");
  if ( (agentName == null) || (agentName.trim().length() == 0) )
    LOG.severe("No User-Agent string set (http.agent.name)!");

  // a part is only present when it is both set and non-empty; the
  // separators below use the same test so that an empty URL or e-mail
  // no longer leaves a dangling "; " inside the parentheses
  boolean hasDesc= (agentDesc != null) && (agentDesc.length() != 0);
  boolean hasURL= (agentURL != null) && (agentURL.length() != 0);
  boolean hasEmail= (agentEmail != null) && (agentEmail.length() != 0);

  StringBuffer buf= new StringBuffer();
  buf.append(agentName);
  if (agentVersion != null) {
    buf.append("/");
    buf.append(agentVersion);
  }
  if (hasDesc || hasEmail || hasURL) {
    buf.append(" (");
    if (hasDesc) {
      buf.append(agentDesc);
      if (hasURL || hasEmail)
        buf.append("; ");
    }
    if (hasURL) {
      buf.append(agentURL);
      if (hasEmail)
        buf.append("; ");
    }
    if (hasEmail)
      buf.append(agentEmail);
    buf.append(")");
  }
  this.agentString= buf.toString();
  FetcherStatus.logTraceMisc(MISC_INFORMATIONAL,
                             "User-Agent string is: " + buf.toString());

  // load hostNameBans
  ArrayList bans= new ArrayList();
  LineNumberReader reader= null;
  try {
    reader=
      new LineNumberReader(
        NutchConf.getConfResourceAsReader(
          NutchConf.get("excludehosts.suffix.file")));
    ArrayList suffixStrings= new ArrayList();
    String line;
    while ( (line= reader.readLine()) != null) {
      // trim out comments and whitespace
      int hashPos= line.indexOf("#");
      if (hashPos >= 0)
        line= line.substring(0, hashPos);
      line= line.trim();
      if (line.length() > 0) {
        line= line.toLowerCase();
        suffixStrings.add(line);
      }
    }
    bans.add(new SuffixStringMatcher(suffixStrings));
  } catch (Exception e) {
    // the ban list is optional: any failure just disables it
    LOG.warning("Not using hostNameSuffixBans: " + e.toString());
  } finally {
    // release the configuration resource whether or not parsing worked
    if (reader != null) {
      try {
        reader.close();
      } catch (java.io.IOException e) {
        // nothing useful can be done if close fails
      }
    }
  }
  if (bans.size() > 0)
    hostNameBans= (TrieStringMatcher[])
      bans.toArray(new TrieStringMatcher[bans.size()]);
  else
    hostNameBans= null;
}
/**
 * Gives access to the shared {@link RobotRulesParser} that was built in
 * the constructor from the configured robot agent names.
 *
 * <p>
 *
 * Intended to be called by {@link HostQueue}s.
 *
 * @return the parser created by the constructor
 */
public RobotRulesParser getRobotRulesParser() {
  return this.robotRulesParser;
}
/**
 * Gives the User-Agent string that the constructor assembled from the
 * <code>http.agent.*</code> configuration properties.
 *
 * @return the User-Agent string for our robot
 */
public String getAgentString() {
  return this.agentString;
}
/**
 * Reports how many requests may be in flight to one server at the same
 * time. The value is a compile-time constant of this class.
 *
 * <p>
 *
 * Intended to be called by {@link HostQueue}s.
 *
 * @return the per-server concurrency limit
 */
public final int getMaxConcurrentRequests() {
  final int perServerLimit= MAX_CONCURRENT_REQUESTS_TO_A_SINGLE_SERVER;
  return perServerLimit;
}
/**
 * Reports the pause, in milliseconds, that is kept between two requests
 * to the same host.
 *
 * <p>
 *
 * Intended to be called by {@link HostQueue}s.
 *
 * @return the per-host delay in milliseconds
 */
public long getMsDelay() {
  return this.msDelay;
}
// Reads entries from the fetchList (one addRequest() call at a time)
// until the list is exhausted, the host-queue limit is reached, or the
// queued-request limit is reached.
private void primeQueue() {
  for (;;) {
    if (fetchListEmpty)
      return;
    if (allHostQueues.size() >= maxQueuedHosts)
      return;
    if (numQueuedRequests >= maxQueuedRequests)
      return;
    addRequest();
  }
}
// Reads one entry from the fetchList and routes it: a read error or end
// of list marks the list empty; an unparsable URL or a banned host is
// failed immediately; an entry not marked for fetching goes straight to
// output; everything else is queued on a HostQueue.
private void addRequest() {
  FetchListEntry entry= null;
  try {
    entry= (FetchListEntry) fetchList.next(new FetchListEntry());
  } catch (java.io.IOException ioe) {
    LOG.severe("Got exception while iterating through FetchList:");
    LOG.severe(ioe.toString());
    LOG.severe("Giving up and treating it as empty");
    fetchListEmpty= true;
    return;
  }

  if (entry == null) {
    // nothing left to read
    fetchListEmpty= true;
    return;
  }
  overallFetcherStatus.readFromFetchlist();

  String urlText= null;
  URL parsedUrl= null;
  try {
    urlText= entry.getPage().getURL().toString();
    parsedUrl= new URL(urlText);
  } catch (Exception ex) {
    LOG.warning("not fetching " + urlText + " due to exception:");
    LOG.warning(ex.toString());
    RequestRecord badUrlRequest= new RequestRecord(entry, true);
    badUrlRequest.setFailureReason(FAIL_BAD_URL);
    badUrlRequest.setFailureMessages(new String[] {urlText});
    handleFailedFetch(badUrlRequest);
    return;
  }

  if (!entry.getFetch()) {
    // entry is only passed through to the output threads
    if (LOG.isLoggable(Level.FINEST)) {
      LOG.finest("not supposed to fetch " + entry.getPage().getURL());
    }
    enqueueOutput(new RequestRecord(parsedUrl, entry, false));
    return;
  }

  if (hostNameBans != null) {
    String lowerHost= parsedUrl.getHost().toLowerCase();
    for (int idx= 0; idx < hostNameBans.length; idx++) {
      if (!hostNameBans[idx].matches(lowerHost)) {
        continue;
      }
      RequestRecord bannedRequest= new RequestRecord(parsedUrl, entry, true);
      bannedRequest.setFailureReason(FAIL_HOSTNAME_BANNED);
      handleFailedFetch(bannedRequest);
      return;
    }
  }

  queueNewRequest(new RequestRecord(parsedUrl, entry, null));
}
// Attaches the request to the HostQueue for its protocol/host/port
// (reusing a live queue, reviving a cached one, or creating a new one),
// adds it to that queue, and moves the queue between the ready/idle/delay
// structures as needed. Ordinary requests for an over-full queue are
// dropped.
private void queueNewRequest(RequestRecord request) {
  URL requestUrl= request.getURL();
  boolean createdQueue= false;

  // redirects and retries arrive with their HostQueue already set
  HostQueue hostQueue= request.getHostQueue();
  if (hostQueue == null) {
    HostQueueKey key= new HostQueueKey(requestUrl.getProtocol(), requestUrl.getHost(), requestUrl.getPort());
    hostQueue= (HostQueue) allHostQueues.get(key);
    if (hostQueue == null) {
      hostQueue= (HostQueue) hostQueueCache.remove(key);
      if (hostQueue != null) {
        // revived from the soft cache; the delay list is the safest
        // place to put it
        allHostQueues.put(key, hostQueue);
        delayHostQueues.add(hostQueue);
      } else {
        hostQueue= new HostQueue(key, this);
        allHostQueues.put(key, hostQueue);
        readyHostQueues.add(hostQueue, -hostQueue.size());
        createdQueue= true;
      }
    }
    request.setHostQueue(hostQueue);
  }

  // fixme: once there is a mechanism to "defer" a page,
  // we should mark page as deferred, not drop on floor!!
  // Only a request that is neither a robots.txt request nor a redirect
  // (ie. no other HostQueue can possibly be waiting for it) is dropped.
  if ( (hostQueue.size() >= MAX_HOSTQUEUE_LENGTH)
       && !request.isRobotsRequest()
       && (request.getParentRequest() == null) ) {
    overallFetcherStatus.droppedOnFloor(request);
    return;
  }

  hostQueue.addRequest(request);

  // robots.txt requests are not counted as queued fetchList requests
  if (!request.isRobotsRequest()) {
    numQueuedRequests++;
  }

  if (createdQueue) {
    // a new queue was already placed in the ready heap above
    return;
  }

  // an existing queue may have to move to a different structure
  if (readyHostQueues.contains(hostQueue)) {
    readyHostQueues.decreaseKey(hostQueue, -hostQueue.size());
    return;
  }
  if (idleHostQueues.remove(hostQueue)) {
    if (hostQueue.requestReady()) {
      readyHostQueues.add(hostQueue, -hostQueue.size());
    } else {
      delayHostQueues.add(hostQueue);
    }
  }
  // otherwise it's busy or in delay- leave it!
}
// output handling
// Pushes all pendingOutputQueue items into outputQueue and wakes up to
// one waiting thread per item added.
//
// Back-pressure: when the output queue grows beyond maxOutputQueue
// (the "fetcher.output.queue" setting) the calling thread waits on
// outputQueue, for at most WAIT_TIMEOUT ms, until an output thread pops
// an entry. The limit used here is the output-queue limit, not
// MAX_HOSTQUEUE_LENGTH, which is the unrelated per-host URL cap.
//
// Caller should hold lock on this.
private void enqueuePendingOutput() {
  int numAdded= pendingOutputQueue.size();
  if (numAdded == 0)
    return;

  int prevSize;
  int newSize;
  synchronized (outputQueue) {
    prevSize= outputQueue.size();
    outputQueue.addAll(pendingOutputQueue);
    pendingOutputQueue.clear();
    overallFetcherStatus.incrementOutputQueueAdd(numAdded);
    newSize= outputQueue.size();

    // if the queue was already over its limit there is a backlog, so
    // the notifications are skipped
    if (prevSize <= maxOutputQueue) {
      for (int i= 0; i < numAdded; i++)
        outputQueue.notify();
    }

    if (newSize > maxOutputQueue) {
      try {
        overallFetcherStatus.incrementOutputQueueFull();
        outputQueue.wait(WAIT_TIMEOUT);
      } catch (InterruptedException e) {
        // deliberately ignored: an interrupt only ends the wait early
      }
    }
  }
}
// Appends the request to the tail of pendingOutputQueue. Nothing reaches
// the OutputThreads until enqueuePendingOutput() is called, so that call
// must follow once all enqueueOutput() calls have been made.
// Caller should hold lock on this.
private void enqueueOutput(RequestRecord request) {
  pendingOutputQueue.add(request);
}
/**
 * Tells whether every request that could still need an {@link
 * OutputThread} has been handed out. A logged SEVERE message marks the
 * run as aborted and also counts as finished.
 *
 * @return true when output processing is complete or the run aborted
 */
public boolean finishedOutput() {
  if (LogFormatter.hasLoggedSevere()) {
    aborted= true;
    return true;
  }

  // cheap unsynchronized checks first; the lock is only taken when
  // there is a chance we are done
  if (!fetchListEmpty || !finishedRequests) {
    return false;
  }

  synchronized (outputQueue) {
    return finishedRequests && (outputQueue.size() == 0);
  }
}
/**
 * Hands a processed request back to the scheduler and fetches the next
 * one that is ready for output.
 *
 * <p>
 *
 * When the output queue is empty and output is not yet finished, the
 * caller waits on the queue for up to <code>WAIT_TIMEOUT</code>
 * milliseconds before looking again.
 *
 * @param finishedRequest   the request that was just output, or
 *                          <code>null</code> if there is none to return
 * @param finishedUrlString passed along with
 *                          <code>finishedRequest</code> to the status
 *                          accounting
 * @return the next request to output, or <code>null</code> if none is
 *         available
 */
public RequestRecord returnOutputAndGetNext(RequestRecord finishedRequest,
                                            String finishedUrlString) {
  final boolean outputDone= finishedOutput();

  synchronized (outputQueue) {
    LOG.finest("returnOutputAndGetNext: got outputQueue lock, returning"
               + " request");

    // accounting happens under the outputQueue lock so that
    // overallFetcherStatus is guarded by a consistent set of mutexes-
    // needed for bandwidth-throttling
    if (finishedRequest != null) {
      overallFetcherStatus.outputStatus(finishedRequest, finishedUrlString);
    }

    if (LOG.isLoggable(Level.FINEST)) {
      LOG.finest("returnOutputAndGetNext: outputQueue: "
                 + outputQueue.size());
    }

    if (outputQueue.size() != 0) {
      LOG.finest("returnOutputAndGetNext: popping immediately");
      overallFetcherStatus.incrementOutputQueuePopNoDelay();
      outputQueue.notify();
    } else if (outputDone) {
      // we are done- wake all waiters
      outputQueue.notifyAll();
    } else {
      overallFetcherStatus.incrementOutputQueueEmpty();
      try {
        LOG.finest("returnOutputAndGetNext: going to wait");
        outputQueue.wait(WAIT_TIMEOUT);
      } catch (InterruptedException e) {
        ;
      }
      LOG.finest("returnOutputAndGetNext: done wait");
    }

    // something may have arrived while we were waiting
    if (outputQueue.size() == 0) {
      return null;
    }
    RequestRecord popped= (RequestRecord) outputQueue.removeFirst();
    overallFetcherStatus.incrementOutputQueuePopped();
    LOG.finest("returnOutputAndGetNext: popped ");
    return popped;
  }
}
/**
 * Tells whether every request from the <code>fetchList</code> has been
 * processed by the {@link FetcherThread}s. A logged SEVERE message marks
 * the run as aborted and also counts as finished.
 *
 * @return true when request processing is complete or the run aborted
 */
public boolean finishedRequests() {
  if (LogFormatter.hasLoggedSevere()) {
    aborted= true;
    return true;
  }

  // cheap unsynchronized check first; the lock is only taken when there
  // is a chance we are done
  if (!fetchListEmpty) {
    return false;
  }

  synchronized (this) {
    // fixme:
    // kill all the queues before we're so bold as to declare finished?
    boolean nothingLeft=
      fetchListEmpty
      && (numQueuedRequests == 0)
      && (busyHostQueues.size() == 0) // implies ready/delay q's are empty
      && (numOutstandingRequests == 0);

    if (nothingLeft) {
      finishedRequests= true;
      return true;
    }

    if (LOG.isLoggable(Level.FINEST)) {
      LOG.finest("fetchListEmpty: " + fetchListEmpty
                 + " numQueuedRequests: " + numQueuedRequests);
    }
    return false;
  }
}
// Housekeeping for the HostQueue structures: promotes queues from the
// head of the delay list, refills from the fetchList, moves surplus idle
// queues into the soft cache, refills again, and finally warns if the
// four structures no longer add up to allHostQueues.
private void checkQueues() {
  // work through the head of the delay list until a queue has to stay
  boolean keepScanning= true;
  while (keepScanning && (delayHostQueues.size() > 0)) {
    HostQueue head= (HostQueue) delayHostQueues.getFirst();
    if (head.requestReady()) {
      delayHostQueues.removeFirst();
      readyHostQueues.add(head, -head.size());
    } else if (head.isFinished()) {
      delayHostQueues.removeFirst();
      idleHostQueues.add(head);
    } else {
      if (!head.delaysPending()) {
        // must be waiting for redirected robots or somesuch: rotate it
        // to the back of the list
        delayHostQueues.removeFirst();
        delayHostQueues.add(head);
      }
      // either rotated or still in its delay: stop here
      keepScanning= false;
    }
  }

  // do this once to see if we re-populate an 'idle' queue
  primeQueue();

  // kill some idle queues
  Iterator idleIter= idleHostQueues.iterator();
  while (idleHostQueues.size() > maxCachedRobots) {
    HostQueue evicted= (HostQueue) idleIter.next();
    idleIter.remove();
    hostQueueCache.put(evicted.getKey(), evicted);

    if (evicted.isFinished()) {
      if (LOG.isLoggable(Level.FINEST)) {
        LOG.finest("disposing of idle queue " + evicted.getKey());
      }
      if (allHostQueues.remove(evicted.getKey()) != evicted) {
        LOG.warning("Warning: queue " + evicted.getKey() + " in idleQueue"
                    + " but not in allHostQueues!");
      }
    } else {
      LOG.warning("Warning: queue " + evicted.getKey() + " in idleQueue"
                  + " but is not finished!");
      // safest place to add...
      delayHostQueues.add(evicted);
    }
  }

  // prime again to replace any idle queues we threw out
  primeQueue();

  // every HostQueue should be in exactly one of the four structures
  int accountedFor= readyHostQueues.size() + idleHostQueues.size()
    + delayHostQueues.size() + busyHostQueues.size();
  if (accountedFor != allHostQueues.size()) {
    LOG.warning(" BAD allHostQueues.size() is: " +
                allHostQueues.size() + ", should be: "
                + accountedFor );
  }
}
/**
 * Returns the next request waiting for processing by a {@link
 * FetcherThread}, or <code>null</code> if no such request exists.
 *
 * <p>
 *
 * Before picking a request, the queue structures are cleaned up and
 * refilled by <code>checkQueues()</code> when there are no ready queues
 * or at least one second has passed since the last cleanup. The elapsed
 * time is measured against <code>now</code>, which is refreshed by
 * <code>returnRequestAndGetNext()</code>.
 */
private synchronized RequestRecord getNextRequest() {
  overallFetcherStatus.incrementGetRequestAttempts();

  if (LOG.isLoggable(Level.FINE))
    LOG.fine("ready: " + readyHostQueues.size()
             + " idle: " + idleHostQueues.size()
             + " delay: " + delayHostQueues.size()
             + " busy: " + busyHostQueues.size()
             + " total: " + allHostQueues.size());

  // fixme: remove this sometime..
  if ( ( readyHostQueues.size() + idleHostQueues.size()
         + delayHostQueues.size() + busyHostQueues.size())
       != allHostQueues.size())
    LOG.severe("ready: " + readyHostQueues.size()
               + " idle: " + idleHostQueues.size()
               + " delay: " + delayHostQueues.size()
               + " busy: " + busyHostQueues.size()
               + " BADTOTAL: " + allHostQueues.size());

  // clean up queues and read more requests if there are no
  // ready queues or a second has passed. The elapsed time is
  // now - lastCheckQueues; the reversed subtraction is never positive
  // and would make this condition true on every call.
  if ( (readyHostQueues.size() == 0)
       || ((now - lastCheckQueues) >= SECONDS_TO_MS_MULTIPLIER) ) {
    lastCheckQueues= now;
    checkQueues();
  }

  // check if we have anything that seems ready
  if (readyHostQueues.size() == 0) {
    // only count it as "all busy" when hosts are actually busy or in delay
    if ( (busyHostQueues.size() != 0)
         || (delayHostQueues.size() != 0) )
      overallFetcherStatus.incrementGetRequestAllBusy();
    return null;
  }

  return getNextRequestHelper();
}
// Pops HostQueues off the ready heap until one yields a request that can
// be handed to a FetcherThread. Returns that request, or null when the
// heap runs empty or a popped queue turns out not to be ready. Every
// popped queue is put back into exactly one of the ready/delay/idle/busy
// structures before this method returns or moves on to the next queue.
// Called from getNextRequest(), which is synchronized on this.
private RequestRecord getNextRequestHelper() {
while (readyHostQueues.size() > 0) {
HostQueue queue= (HostQueue) readyHostQueues.popMin();
// fixme: once there is a mechanism to "defer" a page,
// we should mark page as deferred, not drop on floor!!
// When few hosts are still active (the popped queue is not counted),
// trim this queue down to LOW_ACTIVE_QUEUES_MAX_LENGTH requests.
if (readyHostQueues.size() + busyHostQueues.size()
+ delayHostQueues.size() < LOW_ACTIVE_QUEUES) {
while (queue.size() > LOW_ACTIVE_QUEUES_MAX_LENGTH) {
RequestRecord request= queue.killRequest();
if (request == null)
break;
numQueuedRequests--;
overallFetcherStatus.droppedOnFloor(request);
// drop request on floor
}
}
// Sanity check: a queue in the ready heap should be ready. If it is
// not, re-file it and give up for this call.
if (!queue.requestReady()) {
LOG.warning("queue " + queue.getKey() + " in readyQueue"
+ " but is not ready!");
if (queue.isFinished()) {
idleHostQueues.add(queue);
} else {
delayHostQueues.add(queue); // safest place to add
}
overallFetcherStatus.incrementGetRequestFoundNotReady();
return null;
}
RequestRecord request= queue.getNextRequest();
// The queue claimed to be ready but produced nothing: re-file it and
// give up for this call.
if (request == null) {
LOG.warning("queue " + queue.getKey() + " in ready queue, but not"
+ " ready!");
if (!queue.isFinished()) { // robots.txt expired?
delayHostQueues.add(queue);
} else {
LOG.warning("Warning: finished queue " + queue.getKey()
+ " in ready queue");
idleHostQueues.add(queue);
}
overallFetcherStatus.incrementGetRequestFoundNotReady();
return null;
}
overallFetcherStatus.dispatchingToFetcherThread(request);
// robots.txt requests are not counted in numQueuedRequests
if (!request.isRobotsRequest()) {
numQueuedRequests--;
}
// The request came out of the queue already marked as failed: report
// the failure here and try the next ready queue instead of handing it
// to a FetcherThread.
if (request.getHasFailed()) {
// robots.txt excluded it, make host ready immediately if we can
if (queue.requestReady()) {
readyHostQueues.add(queue, -queue.size());
} else if (queue.isFinished()) {
idleHostQueues.add(queue);
} else {
// always safe to add to delay q
delayHostQueues.add(queue);
}
handleFailedFetch(request);
overallFetcherStatus.incrementGetRequestFoundExcluded();
continue;
}
if (LOG.isLoggable(Level.FINE))
LOG.fine("got " + request.getURLString() + ", ready= "
+ queue.requestReady());
// Re-file the queue: back into the ready heap if it can serve another
// request right away, into the delay list if delays are pending,
// otherwise into the busy set.
if (queue.requestReady())
readyHostQueues.add(queue, -queue.size());
else if (queue.delaysPending())
delayHostQueues.add(queue);
else
busyHostQueues.add(queue);
if (LOG.isLoggable(Level.FINE))
LOG.fine("numOutstandingRequests: " + numOutstandingRequests);
// robots.txt requests are not counted in numOutstandingRequests either
if (!request.isRobotsRequest()) {
numOutstandingRequests++;
if (LOG.isLoggable(Level.FINE))
LOG.fine("incremented numOutstandingRequests ("
+ numOutstandingRequests + "): " + request.getURLString());
}
overallFetcherStatus.incrementGetRequestSuccesses();
return request;
}
// ran out of ready queues without finding a dispatchable request
return null;
}
/**
 * Tells this <code>RequestScheduler</code> that a fetch attempt was made
 * for <code>request</code>. The work is done while holding the lock on
 * this scheduler.
 *
 * @param request        the request that was attempted
 * @param httpAccounting transfer accounting for the attempt; its
 *                       sent/read byte counts are added to the overall
 *                       statistics when a response was received
 */
public void returnRequest(
    RequestRecord request, MiscHttpAccounting httpAccounting ) {
  synchronized (this) {
    this.unsyncReturnRequest(request, httpAccounting);
  }
}
// Does the work of returnRequest() without taking any lock; the caller
// must already be synchronized on this.
//
// The request's HostQueue leaves the busy set (if it is there), and the
// request is then routed by outcome: already failed, no response
// (retryable), 200, 3xx redirect, 404, other 4xx, or any other code.
private void unsyncReturnRequest(
    RequestRecord request, MiscHttpAccounting httpAccounting ) {
  if (request.getResponse() == null) {
    if (LOG.isLoggable(Level.FINE))
      LOG.fine("FetcherThread returned: " + request.getURLString()
               + " completed: false" );
  } else {
    if (LOG.isLoggable(Level.FINE))
      LOG.fine("FetcherThread returned: " + request.getURLString()
               + " completed: true code:"
               + request.getResponse().getCode());
  }

  // a busy host moves to the delay list; it could also be in the delay
  // or ready structures already, in which case it stays put
  HostQueue hostQueue= request.getHostQueue();
  if (busyHostQueues.remove(hostQueue)) {
    delayHostQueues.add(hostQueue);
  }

  Response resp= request.getResponse();

  // robots.txt requests are not counted as outstanding
  if (!request.isRobotsRequest()) {
    numOutstandingRequests--;
  }

  if (request.getHasFailed()) {
    handleFailedFetch(request);
    return;
  }
  if (resp == null) {
    // fetch failed, can retry
    handleUnsuccessfulFetchAttempt(request);
    return;
  }

  overallFetcherStatus.incrementRawBytes(httpAccounting.getBytesSent(),
                                         httpAccounting.getBytesRead());
  overallFetcherStatus.incrementContinues(resp.getNumContinues());

  int status= resp.getCode();
  if (status == 200) {
    handleSuccessFetch(request);
  } else if ( (status >= 300) && (status < 400) ) {
    // handle redirect
    handleRedirectedFetch(request);
  } else {
    // every remaining code is a failure that will not be retried
    if (status == 404) {
      // handle doesn't exist
      request.setFailureReason(FAIL_NOT_FOUND);
    } else if ( (status >= 400) && (status < 500) ) {
      // handle permission error
      request.setFailureReason(FAIL_FORBIDDEN);
    } else {
      request.setFailureReason(FAIL_UNKNOWN_RESP_CODE);
      request.setFailureMessages(new String[] { Integer.toString(status) });
    }
    handleFailedFetch(request);
  }
}
/**
 * Gives the timestamp, in milliseconds, that was captured at the start
 * of the most recent {@link #returnRequestAndGetNext} call. Callers can
 * use it for delay calculations instead of asking the system clock
 * again; because it was captured earlier, it is never ahead of the
 * current time.
 *
 * @return the cached timestamp in milliseconds
 */
public long getTime() {
  return this.now;
}
/**
 * Combines returning a finished fetch attempt with asking for more work.
 * FetcherThreads call this once for every request they were given.
 *
 * <p>
 *
 * The cached clock (see {@link #getTime()}) is refreshed first. Then,
 * under the lock on this scheduler, <code>retRequest</code> is returned
 * (if not null), the next request is looked up unless all requests are
 * finished, and pending output is pushed to the output queue.
 *
 * @param retRequest     the request that was just attempted, or
 *                       <code>null</code> if there is nothing to return
 * @param httpAccounting transfer accounting for the last fetch attempt
 * @return the next request to fetch, or <code>null</code> if none is
 *         available
 */
public RequestRecord returnRequestAndGetNext(
    RequestRecord retRequest, MiscHttpAccounting httpAccounting ) {
  now= System.currentTimeMillis();

  synchronized (this) {
    // hand back the finished request, if any
    if (retRequest != null) {
      unsyncReturnRequest(retRequest, httpAccounting);
    }

    // look for more work unless everything has been processed
    RequestRecord upcoming= null;
    if (!finishedRequests()) {
      upcoming= getNextRequest();
    }

    // push output items into output queue
    enqueuePendingOutput();
    return upcoming;
  }
}
// Finalizes a request that will not be fetched or retried: marks it
// failed, notifies its queues, records the failure in the statistics and,
// unless it is a robots.txt request, queues it for output.
private void handleFailedFetch(RequestRecord request) {
  request.setHasFailed(true);

  // tell HostQueue this request is done
  if (LOG.isLoggable(Level.FINEST)) {
    LOG.finest("notifyQueuesOfCompletion: ");
  }
  request.notifyQueuesOfCompletion();
  overallFetcherStatus.requestFailed(request);

  // robots.txt requests never go to the output threads
  if (request.isRobotsRequest()) {
    return;
  }
  enqueueOutput(request);
}
// Handles a fetch attempt that produced no response: counts the error,
// then either fails the request for good (error limit reached) or
// re-queues it for another attempt.
private void handleUnsuccessfulFetchAttempt(RequestRecord request) {
  overallFetcherStatus.requestError(request);
  request.incrementErrors();

  boolean outOfRetries= request.getNumErrors() >= maxPageErrors;
  if (outOfRetries) {
    request.setFailureReason(FAIL_TOO_MANY_ERRORS);
    handleFailedFetch(request);
  } else {
    overallFetcherStatus.retry(request);
    request.notifyQueuesOfCompletion();
    // reset
    request.setErrorReason(ERR_UNKNOWN);
    // fixme: should have better re-enqueue strategy
    queueNewRequest(request);
  }
}
// Finalizes a successfully fetched request: notifies its queues, records
// the success in the statistics and, unless it is a robots.txt request,
// queues it for output.
private void handleSuccessFetch(RequestRecord request) {
  // tell HostQueue this request is done
  request.notifyQueuesOfCompletion();
  overallFetcherStatus.succeeded(request);

  // robots.txt requests never go to the output threads
  if (request.isRobotsRequest()) {
    return;
  }
  enqueueOutput(request);
}
// Handles a 3xx response. The request is failed when the redirect limit
// has been reached, when the Location header is missing or cannot be
// resolved against the request URL, or when the target already appears
// in the chain of parent requests (a loop). Otherwise a child request
// for the target is queued.
private void handleRedirectedFetch(RequestRecord request) {
  Response response= request.getResponse();
  URL target= null;
  try {
    target = new URL(request.getURL(), response.getHeader("Location"));
  } catch (Exception e) {
    // deliberately ignored: target stays null and is reported below as
    // FAIL_REDIRECT_MISSING_TARGET
  }

  if (LOG.isLoggable(Level.FINE))
    LOG.fine("code is 3xx, target is " + target);

  // too many redirects? ">=" (as used for the error limit in
  // handleUnsuccessfulFetchAttempt) still stops the chain if the limit
  // is configured negative or the count ever exceeds it
  if (request.getNumRedirects() >= maxPageRedirects) {
    request.setFailureReason(FAIL_TOO_MANY_REDIRECTS);
    handleFailedFetch(request);
    return;
  }
  if (target == null) {
    request.setFailureReason(FAIL_REDIRECT_MISSING_TARGET);
    handleFailedFetch(request);
    return;
  }

  // redirect loop? walk up the parent chain looking for the target
  String targetString= target.toString();
  RequestRecord tmp= request;
  while (tmp != null) {
    if (targetString.equals(tmp.getURLString())) {
      // loop!
      request.setFailureReason(FAIL_REDIRECT_LOOP_DETECTED);
      handleFailedFetch(request);
      return;
    }
    tmp= tmp.getParentRequest();
  }

  overallFetcherStatus.redirected(request);
  request.incrementRedirects();

  // fixme: should have better re-enqueue strategy
  request.notifyQueuesOfCompletion();
  // the child request records the original as its parent
  request= new RequestRecord(request, target, null);
  queueNewRequest(request);
}
/**
 * Writes a snapshot of the scheduler state to the trace log: the size of
 * each HostQueue structure (ready, idle, delay, busy, total, cached),
 * the number of queued fetchList entries, and whether the fetchList is
 * exhausted. The values are read without taking any lock, so the counts
 * may not be consistent with each other.
 */
public void logState() {
  final int statsCode= MISC_STATS;
  final String emptiness= fetchListEmpty ? "" : " not";

  FetcherStatus.logTraceMisc(statsCode, "HostQueue sizes:");
  FetcherStatus.logTraceMisc(statsCode, "\tready: " + readyHostQueues.size());
  FetcherStatus.logTraceMisc(statsCode, "\tidle: " + idleHostQueues.size());
  FetcherStatus.logTraceMisc(statsCode, "\tdelay: " + delayHostQueues.size());
  FetcherStatus.logTraceMisc(statsCode, "\tbusy: " + busyHostQueues.size());
  FetcherStatus.logTraceMisc(statsCode, "\ttotal: " + allHostQueues.size());
  FetcherStatus.logTraceMisc(statsCode, "\tcached:" + hostQueueCache.size());

  FetcherStatus.logTraceMisc(statsCode, "HostQueues contain " + numQueuedRequests
                             + " fetchList entries");
  FetcherStatus.logTraceMisc(statsCode,"FetchList is" +
                             emptiness
                             + " empty");
}
/**
 * This method starts processing the <code>fetchList</code>, and
 * does not return until processing is complete.
 *
 * <p>
 *
 * It primes the queues, starts <code>numFetchThreads</code>
 * {@link FetcherThread}s (some of them initially throttled) and
 * <code>numOutputThreads</code> {@link OutputThread}s, and then
 * loops until all requests are finished or the fetch is aborted.
 * Inside the loop it sleeps until the next deadline and then
 *
 * <ul>
 * <li>every <code>throttlePeriod</code> seconds, if both
 * <code>throttlePeriod</code> and <code>throttleMaxBandwidth</code>
 * are positive, measures the bandwidth used over the last period and
 * throttles or unthrottles fetcher threads to approach
 * <code>throttleMaxBandwidth</code>;</li>
 * <li>every <code>STATS_MINUTES</code> minutes, logs statistics and
 * the scheduler state.</li>
 * </ul>
 *
 * Afterwards it unthrottles and joins the fetcher threads, waits for
 * output to drain, joins the output threads, logs final statistics
 * and closes the fetchList and the three output DBs.
 *
 * @return <code>false</code> if no errors were encountered;
 * <code>true</code> if the fetch was aborted or an exception cut
 * the run short.
 */
public boolean run() {
  // set when an exception ends the run early, so that the caller is not
  // told that a half-finished fetch completed without errors
  boolean failed= false;

  try {

    primeQueue();

    FetcherThread[] fetchers= new FetcherThread[numFetchThreads];

    long now= new Date().getTime();
    long nextStats= now
      + (STATS_MINUTES * 60 * SECONDS_TO_MS_MULTIPLIER);
    long nextThrottle= now
      + (throttlePeriod * SECONDS_TO_MS_MULTIPLIER);

    FetcherStatus lastStatus= null;
    int curNumThreadsThrottled= 0;

    // Throttled threads always occupy indices [0, curNumThreadsThrottled)
    // of fetchers; the throttle/unthrottle loops below rely on that.
    for (int i= 0; i < numFetchThreads; i++) {
      fetchers[i]= new FetcherThread(this);
      if (throttleInitialThreads + i < numFetchThreads) {
        fetchers[i].throttle();
        curNumThreadsThrottled++;
      }
      fetchers[i].start();
    }

    overallFetcherStatus.logTraceMisc(
      MISC_INFORMATIONAL,
      "Starting with " + (numFetchThreads - curNumThreadsThrottled)
      + "/" + numFetchThreads + " fetcher threads active");

    OutputThread[] outputers= new OutputThread[numOutputThreads];
    for (int i= 0; i < numOutputThreads; i++) {
      outputers[i]= new OutputThread(this, fetcherDb, rawDb, strippedDb);
      outputers[i].start();
    }

    while (!finishedRequests() && !aborted) {

      now= new Date().getTime();

      // The same predicate decides both whether to wake up for a throttle
      // deadline and whether to act on it.  If the two disagree (as they
      // used to for throttleMaxBandwidth == 0), nextThrottle never
      // advances and this loop spins without sleeping.
      boolean throttling= (throttlePeriod > 0) && (throttleMaxBandwidth > 0);

      long wakeUp;
      if (throttling && (nextThrottle <= nextStats))
        wakeUp= nextThrottle;
      else
        wakeUp= nextStats;

      long nextSleep= wakeUp - now;
      if (nextSleep < 0)
        nextSleep= 0;

      try {
        Thread.sleep(nextSleep);
      } catch (InterruptedException e) {
        // Deliberately ignored: waking early is harmless, since the
        // deadlines are re-checked against the clock below.
      }

      now= new Date().getTime();

      if (throttling && (now >= nextThrottle)) {

        FetcherStatus currentFetcherStatus;
        synchronized (this) {
          synchronized (outputQueue) {
            currentFetcherStatus= overallFetcherStatus.cloneStatus();
          }
        }

        // get bandwidth over last period, kbits/s
        int recentBandwidth;
        if (lastStatus == null) {
          recentBandwidth= currentFetcherStatus.getRawBandwidth();
        } else {
          FetcherStatus diffStatus=
            currentFetcherStatus.getDelta(lastStatus);
          recentBandwidth= diffStatus.getRawBandwidth();
          if (LOG.isLoggable(Level.FINEST)) {
            currentFetcherStatus.logStats();
            lastStatus.logStats();
            diffStatus.logStats();
          }
        }

        if (recentBandwidth < 1)
          recentBandwidth= 1;

        // decide how many threads to throttle

        // All threads may be throttled on the first pass (e.g. when
        // throttleInitialThreads is 0); never divide by zero threads.
        int activeThreads= numFetchThreads - curNumThreadsThrottled;
        if (activeThreads < 1)
          activeThreads= 1;

        // Integer division truncates to 0 whenever bandwidth is lower
        // than the thread count; clamp so the division below is safe.
        int kbitsPerThread= recentBandwidth / activeThreads;
        if (kbitsPerThread < 1)
          kbitsPerThread= 1;

        int newNumThreadsThrottled=
          numFetchThreads - (throttleMaxBandwidth / kbitsPerThread);

        if (lastStatus != null) {
          // smooth it with our last decision
          newNumThreadsThrottled= (newNumThreadsThrottled
                                   + curNumThreadsThrottled) / 2;
        }

        // Not enabled: a more conservative policy would refuse to
        // unthrottle threads whenever bandwidth per thread has dropped
        // more than 10% since the previous period.  That rarely exceeds
        // the desired bandwidth in a period, but averages less, too.

        // always leave at least one thread running
        if (newNumThreadsThrottled >= numFetchThreads)
          newNumThreadsThrottled= numFetchThreads - 1;

        if (newNumThreadsThrottled < 0)
          newNumThreadsThrottled= 0;

        overallFetcherStatus.logTraceMisc(
          MISC_INFORMATIONAL, "Current bandwidth: "
          + recentBandwidth + " kbits/s (" + kbitsPerThread
          + "kbits/s/thread )");

        overallFetcherStatus.logTraceMisc(
          MISC_INFORMATIONAL, "Adjusting the number of active fetcher"
          + " threads to " + (numFetchThreads - newNumThreadsThrottled)
          + "/" + numFetchThreads);

        // throttle / unthrottle
        while (curNumThreadsThrottled > newNumThreadsThrottled) {
          curNumThreadsThrottled--;
          fetchers[curNumThreadsThrottled].unthrottle();
        }

        while (curNumThreadsThrottled < newNumThreadsThrottled) {
          fetchers[curNumThreadsThrottled].throttle();
          curNumThreadsThrottled++;
        }

        // set up for next time; reschedule with the same period that
        // was used for the first deadline
        lastStatus= currentFetcherStatus;
        nextThrottle+= (throttlePeriod * SECONDS_TO_MS_MULTIPLIER);
      }

      if (now >= nextStats) {
        try {
          overallFetcherStatus.logStats();
          logState();
        } catch (Exception e) {
          // a failure to log statistics must not stop the fetch
          e.printStackTrace();
        }
        nextStats+= (STATS_MINUTES * 60 * SECONDS_TO_MS_MULTIPLIER);
      }

    }

    LOG.fine("Done requests");

    // unthrottle any throttled FetcherThreads
    while (curNumThreadsThrottled > 0) {
      curNumThreadsThrottled--;
      fetchers[curNumThreadsThrottled].unthrottle();
    }

    for (int i= 0; i < numFetchThreads; i++) {
      fetchers[i].join();
    }

    while (!finishedOutput() && !aborted) {
      Thread.sleep(1000);
    }

    LOG.fine("Done output");

    for (int i= 0; i < numOutputThreads; i++) {
      outputers[i].join();
    }

    overallFetcherStatus.logStats();
    logState();

    fetchList.close();
    fetcherDb.close();
    rawDb.close();
    strippedDb.close();

  } catch (Exception e) {
    failed= true;
    LOG.severe(e.toString());
    e.printStackTrace();
  }

  return aborted || failed;
}
/**
 * Sets the log level to <code>level</code> on this class's logger and
 * on the loggers of {@link Http}, {@link Ftp}, {@link RequestRecord},
 * {@link HostQueue}, {@link FetcherThread} and {@link OutputThread}.
 */
public void setLogLevel(Level level) {
  // same loggers, same order, as one table instead of seven statements
  final Logger[] loggers= {
    LOG,
    Http.LOG,
    Ftp.LOG,
    RequestRecord.LOG,
    HostQueue.LOG,
    FetcherThread.LOG,
    OutputThread.LOG
  };

  for (int i= 0; i < loggers.length; i++) {
    loggers[i].setLevel(level);
  }
}
/**
 * Run the fetcher.
 *
 * <p>
 *
 * Usage: <code>RequestScheduler [-verbose] [-showThreadID] dir</code>.
 * <code>dir</code> must be the last argument.  <code>-verbose</code>
 * selects log level <code>FINER</code> instead of <code>INFO</code>;
 * <code>-showThreadID</code> turns on thread IDs in the
 * {@link LogFormatter}.  On a malformed command line the usage string
 * is printed to <code>System.err</code> and the JVM exits with
 * status -1.
 *
 * <p>
 *
 * When the run finishes, a file named
 * <code>FetcherOutput.DONE_NAME</code> is created in <code>dir</code>
 * if {@link #run()} returned <code>false</code>, and a file named
 * <code>FetcherOutput.ERROR_NAME</code> otherwise.
 *
 * @param args the command line
 * @throws RuntimeException if the done file already exists in
 * <code>dir</code>
 * @throws Exception if opening the fetchList or the output DBs, or
 * creating the marker file, fails
 */
public static void main(String[] args) throws Exception {
  boolean verbose = false;
  boolean showThreadID = false;
  String directory = null;

  String usage = "Usage: RequestScheduler [-verbose] [-showThreadID] dir";

  for (int i = 0; i < args.length; i++) {       // parse command line
    if (args[i].equals("-verbose")) {           // found -verbose option
      verbose = true;
    } else if (args[i].equals("-showThreadID")) {
      showThreadID = true;
    } else if (i != args.length-1) {            // dir must come last
      System.err.println(usage);
      System.exit(-1);
    } else {                                    // root is required parameter
      directory = args[i];
    }
  }

  // Covers both an empty command line and one made up only of flags.
  // Without this check a null directory would be passed to
  // new File(parent, child), which resolves against the current working
  // directory, so the fetcher would silently run there.
  if (directory == null) {
    System.err.println(usage);
    System.exit(-1);
  }

  File doneFile = new File(directory, FetcherOutput.DONE_NAME);
  if (doneFile.exists())                        // check done file
    throw new RuntimeException("already fetched: " + doneFile + " exists");

  ArrayFile.Reader fetchList = new ArrayFile.Reader
    (new File(directory, FetchListEntry.DIR_NAME).toString());

  ArrayFile.Writer fetcherDb = new ArrayFile.Writer
    (new File(directory, FetcherOutput.DIR_NAME).toString(),
     FetcherOutput.class);
  ArrayFile.Writer rawDb = new ArrayFile.Writer
    (new File(directory, FetcherContent.DIR_NAME).toString(),
     FetcherContent.class);
  ArrayFile.Writer strippedDb = new ArrayFile.Writer
    (new File(directory, FetcherText.DIR_NAME).toString(),
     FetcherText.class);

  RequestScheduler scheduler = new RequestScheduler(fetchList, fetcherDb,
                                                    rawDb, strippedDb);

  // 20040405, xing
  if (showThreadID)
    LogFormatter.setShowThreadIDs(showThreadID);

  scheduler.setLogLevel(verbose ? Level.FINER : Level.INFO);

  boolean aborted= scheduler.run();             // run the Fetcher

  if (aborted)
    // create the error file
    new File(directory, FetcherOutput.ERROR_NAME).createNewFile();
  else
    // create the done file
    doneFile.createNewFile();
}
}